import os
import pandas as pd
import sys
sys.path.insert(0, '../..')
import itertools
from scipy import stats
import numpy as np
from JKBio.epigenetics import chipseq as chip
from JKBio.utils import helper
from JKBio.utils import plot
import igv
import SimpSOM as sps
from scipy import stats
import seaborn as sns
from matplotlib import cm
from matplotlib import pyplot as plt
from IPython.display import IFrame
import seaborn as sns
from bokeh.plotting import *
import igv
import numba
from numba import jit
from scipy.cluster.hierarchy import linkage, leaves_list
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from IPython.display import IFrame
from statsmodels.stats.multitest import multipletests
from pybedtools import BedTool
import pyBigWig
# Render bokeh plots inline in the notebook.
output_notebook()
%load_ext autoreload
%autoreload 2
#setting basic parameters
project="Cobinding_ChIP"
version="v3"
merging_version="motifs"
window="150"
#retrieving the data from the previous notebook
# NOTE(review): "%store -r version" and "%store -r window" below overwrite
# the values assigned just above with whatever the previous notebook
# stored — confirm this shadowing is intended.
%store -r mergedmot
%store -r cols
%store -r annot
%store -r version
%store -r window
%store -r crc
# Bare expression: notebook-style display of the restored merging version.
merging_version
# Zero out each TF signal column wherever its motif-filter column
# (columns 72 onward, named "<TF>_<suffix>") is 0, then keep only the
# first 72 columns, renaming each filtered TF column to "<TF>_filtered".
# (Loop-body indentation was lost in the notebook export; restored here.)
for val in mergedmot.columns[72:]:
    mergedmot.loc[mergedmot[mergedmot[val] == 0].index, val.split('_')[0]] = 0
merged = mergedmot[mergedmot.columns[:72]].rename(
    columns={val.split('_')[0]: val.split('_')[0] + "_filtered"
             for val in mergedmot.columns[72:]})
# Pairwise Pearson correlation between samples on the binarized signal
# (astype(bool): peak present / absent). The zscore variant is kept for
# reference. (Loop-body indentation restored — it was lost in the export.)
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)),
                    columns=merged.columns[cols:], index=merged.columns[cols:])
# Hierarchical clustering on the non-annotation rows only.
link = linkage(data[:annot-cols])
# Annotation rows become a colour bar; drop columns that are themselves
# annotation tracks so the bar only covers data columns.
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    # Map a correlation in [-1, 1] onto the 256-entry RdBu_r colormap
    # (v == 1 indexes 256; matplotlib maps out-of-range ints to the last colour).
    a = [rdb(128 + int(v * 128)) for v in col[val]]
    col[val] = a
#correlation_withannotation
# NOTE(review): rows are sliced with iloc[:annot] here but the linkage above
# used [:annot-cols] — confirm the off-by-`cols` difference is intended.
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1, vmax=1,
                     col_colors=col.T, figsize=(10, 20), cmap='RdBu_r',
                     row_linkage=link, col_cluster=False, colors_ratio=0.01)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_with_annotation.pdf')
plt.show()
# Same binarized-signal correlation as the previous cell, replotted with a
# smaller figure / colour-bar ratio and a different title and output file.
# (Loop-body indentation restored — it was lost in the export.)
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)),
                    columns=merged.columns[cols:], index=merged.columns[cols:])
link = linkage(data[:annot-cols])
# Annotation rows become a colour bar; keep only data columns.
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    # Correlation in [-1, 1] -> RdBu_r colour.
    a = [rdb(128 + int(v * 128)) for v in col[val]]
    col[val] = a
#correlation_withannotation
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1, vmax=1,
                     col_colors=col.T, figsize=(10, 15), cmap='RdBu_r',
                     row_linkage=link, col_cluster=False, colors_ratio=0.008)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_onbinarized_signal_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_onbinarized_signal_with_annotation.pdf')
plt.show()
How many peaks in A (columns) overlap with a peak in B (rows)?
In other words:
what percentage of B's peaks are overlapped by A's peaks?
# Pairwise peak overlap between all samples (normalized), plus the
# correlation, enrichment and p-values computed on those overlaps.
overlap, correlation, pvals, enrichment = chip.pairwiseOverlap(merged, norm=True)
#saving the enrichments
# Build the shared output prefix once; the resulting paths are identical
# to writing each one out in full.
out_prefix = '../results/' + project + '/' + version + '_' + merging_version + '_' + window
overlap.to_csv(out_prefix + 'pairwise_overlap.csv')
correlation.to_csv(out_prefix + 'pairwise_correlation_on_overlap.csv')
enrichment.to_csv(out_prefix + 'pairwise_enrichment_on_overlap.csv')
pvals.to_csv(out_prefix + 'pairwise_enrichment_on_overlap_pvals.csv')
# Clustermap of the pairwise overlap fractions.
# (Loop-body indentation restored — it was lost in the export.)
data = pd.DataFrame(data=overlap, index=merged.columns[cols:], columns=merged.columns[cols:])
link = linkage(data.iloc[:annot-cols])  # D being the measurement
# Annotation columns (those not among the clustered rows) become a colour bar.
c = data[[co for co in data.columns if co not in data.iloc[:annot-cols].index.tolist()]]
viridis = cm.get_cmap('viridis', 256)
for val in c.columns:
    # Overlap fraction in [0, 1] -> viridis colour.
    a = [viridis(int(v * 255)) for v in c[val]]
    c[val] = a
# NOTE(review): the trailing column range starts at `annot`, while every
# slice above uses `annot-cols` — confirm this offset is intended.
fig = sns.clustermap(data.iloc[:annot-cols][data.columns[np.concatenate((leaves_list(link), range(annot,len(data.columns))))]], row_linkage=link, col_colors=c, figsize=(20,18), colors_ratio=0.01, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'pairwise_overlap_clustermap.pdf')
plt.show()
Correlation computed on the overlaps shown above.
# Clustermap of the correlation computed on the pairwise overlaps.
# (Loop-body indentation restored — it was lost in the export.)
data = pd.DataFrame(data = correlation, index = merged.columns[cols:], columns = merged.columns[cols:])
link = linkage(data.iloc[:annot-cols], optimal_ordering=True)  # D being the measurement
c = data[[co for co in data.columns if co not in data.iloc[:annot-cols].index.tolist()]]
# Colour bar uses the RdBu_r colormap `rdb` built in the earlier correlation
# cell (correlations live in [-1, 1]); the original also created an unused
# viridis colormap here, which has been dropped.
for val in c.columns:
    a = [rdb(128 + int(v * 128)) for v in c[val]]
    c[val] = a
# NOTE(review): the trailing column range starts at `annot`, while the
# slices above use `annot-cols` — confirm this offset is intended.
fig = sns.clustermap(data.iloc[:annot-cols][data.columns[np.concatenate((leaves_list(link), range(annot,len(data.columns))))]], row_linkage=link, col_colors=c, colors_ratio=0.01, figsize=(20,18), vmin=-1, vmax=1, cmap='RdBu_r', col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_on_pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_onoverlap.pdf')
plt.show()
# Hierarchical clustering of the all-to-all enrichment matrix, using the
# same linkage for rows and columns so the matrix stays symmetric.
link = linkage(enrichment, optimal_ordering=True)
fig = sns.clustermap(enrichment,figsize=(20,20), vmin=-5,vmax=5, cmap='RdBu_r', col_linkage=link, row_linkage=link)
fig.ax_col_dendrogram.set_visible(False)
fig.ax_row_dendrogram.set_visible(False)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_enrichment_clustermap_all_to_all.pdf')
plt.show()
# Bare expression: notebook-style inspection of the maximum enrichment
# after reordering by the dendrogram leaves.
enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values.max()
link = linkage(enrichment, optimal_ordering=True)
# Interactive enrichment matrix, leaf-ordered, values clipped to [-4, 4];
# the matching p-values are passed alongside.
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)
# Restrict the matrix to the stored `crc` sample list and replot.
enrichment = enrichment[crc].loc[crc]
link = linkage(enrichment, optimal_ordering=True)
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)
I have tried Gaussian mixtures and the agglomerative clustering algorithm. Only the second can create a hierarchical clustering.
It seems that Gaussian mixture makes more sense given that the data we have, for now, is more "homogeneous".
I am still not so happy with the clustering. It may be because of outlier values and the high number of noisy values from locations with no peaks.
We can use methods similar to RNA-seq to improve this (clamping values, log transform, a first round of PCA...).
# KMeans clustering of the z-scored cobinding matrix; plot the cluster
# centroids as a clustermap.
data = merged[merged.columns[cols:annot]].values
# using score
scaled_data = stats.zscore(data)
# using regular scaling
#scaled_data = (data-data.min(0))/(data.max(0)-data.min(0))
# Bare expression: notebook-style inspection of the per-column maxima.
data.max(0)
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
n = 1
n_clust = [10, 20, 50, 100, 300]
kmean = KMeans(n_clusters=n_clust[n], n_jobs=8)
groups = kmean.fit_predict(scaled_data)
centroid = kmean.cluster_centers_
# Bug fix: removed a stray "df/" line (a syntax error when run as a script)
# and a duplicated DataFrame construction.
df = pd.DataFrame(centroid, columns=merged.columns[cols:annot]).T
#df = (df.T/df.max(1)).T
fig = sns.clustermap(df, vmax=1)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_centroids.pdf')
# Sorted clustermap of a 5000-peak random subset, ordered by KMeans cluster,
# with a per-cluster rainbow colour bar plus per-annotation viridis bars.
# (Loop-body indentation restored — it was lost in the export.)
merged = merged.reset_index(drop=True)
rand = np.random.choice(merged.index, 5000)
subgroups = groups[rand]
sorting = np.argsort(subgroups)
rainb = cm.get_cmap('gist_rainbow', len(set(groups)))
colors = [rainb(i) for i in subgroups[sorting]]
viridis = cm.get_cmap('viridis', 256)
# log2-transform the annotation columns and rescale each to [0, 1].
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val] = np.log2(1 + data[val])
data = (data - data.min()) / (data.max() - data.min())
data = data.iloc[rand].iloc[sorting]
for val in data.columns:
    # Value in [0, 1] -> viridis colour (v == 1 indexes 256; matplotlib
    # maps out-of-range ints to the last colour).
    a = [viridis(int(v * 256)) for v in data[val]]
    data[val] = a
data["clusters"] = colors
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[rand].iloc[sorting].T), vmin=0, vmax=3, figsize=(20,20), z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with Kmeans")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_clustermap_cobinding.pdf')
plt.show()
# Per-TF enrichment over the KMeans cluster assignments.
enr, pvals, _ = chip.enrichment(merged, groups=groups)
# enrichment for each cluster
fig = sns.clustermap(enr.T,figsize=(14,20), col_cluster=0, vmax=4, vmin=-4, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'enrichment_on_cluster_metasubset_zcorescale.pdf')
plt.show()
The p-value is conveyed by opacity in the plot below.
# Interactive enrichment matrix for the KMeans clusters.
# NOTE(review): `link` here still holds the linkage computed in the earlier
# CRC-enrichment cell, not a clustering of `enr` — confirm that leaf
# ordering is the one intended for this plot.
fig = plot.correlationMatrix(enr.loc[enr.columns[leaves_list(link)]][enr.columns[leaves_list(link)]].values, maxokpval=10**-9, pvals= pvals[enr.columns[leaves_list(link)]].loc[enr.columns[leaves_list(link)]], names=enr.columns[leaves_list(link)].tolist(), size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/', interactive=True, minval=-4, maxval=4, dataIsCorr=True)
plot.andrew(groups, merged, annot, cols=8, precise=False, folder = '../results/' + project + '/plots/' + version + '_' + merging_version + '_' + window + '_kmeans_', title = "sorted clustermap of cobindings clustered with Kmeans")
#Import the library
size = 20
#Build a network 20x20 with a weights format taken from the raw_data and activate Periodic Boundary Conditions.
net = sps.somNet(size, size, scaled_data, PBC=True)
#Train the network for 10000 epochs and with initial learning rate of 0.01.
net.train(0.01, 10000)
#Save the weights to file
net.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size))
# Reload the trained network from the saved weight file.
net = sps.somNet(0, 0, scaled_data, loadFile='../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size), PBC=True)
pathplot = "../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_somNets/"
# Create the output directory portably (was the shell magic "! mkdir");
# exist_ok avoids the error mkdir printed on re-runs.
os.makedirs(pathplot, exist_ok=True)
#Print a map of the network nodes and colour them according to the first feature (column number 0) of the dataset
#and then according to the distance between each node and its neighbours.
# (Loop-body indentation restored — it was lost in the export.)
# NOTE(review): colname is taken from `data.columns`, which at this point is
# the colour-bar DataFrame from the KMeans cell, not the columns of
# scaled_data — confirm the labels line up.
for col in range(scaled_data.shape[1]):
    net.nodes_graph(colnum=col, printout=True, show=True, path=pathplot, colname=data.columns[col])
diffs = net.diff_graph(show=False, returns=True)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_cobinding_SOM_'+str(size)+'.pdf')
plot.SOMPlot(net, size=size, colnames=merged.columns[cols:annot], folder='../results/' + project + '/plots/' + version + '_' + merging_version + '_'+window, minweight=0.05)
#Cluster the datapoints according to the Quality Threshold algorithm.
clusts = net.cluster(scaled_data, type='KMeans', numcl=n_clust[n])
# Flatten the per-cluster index lists into one ordering vector.
# (Loop-body indentation restored — it was lost in the export.)
sorting = []
for clust in clusts:
    sorting.extend(clust)
sorting = np.array(sorting)
# One rainbow colour per SOM cluster, repeated once per member
# (renamed from the misleading local name `viridis`).
rainbow = cm.get_cmap('gist_rainbow', len(clusts))
colors = []
for i, clust in enumerate(clusts):
    colors.extend([rainbow(i)] * len(clust))
colors = np.array(colors)
viridis = cm.get_cmap('viridis', 256)
# log2-transform the annotation columns and rescale each to [0, 1].
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val] = np.log2(1 + data[val])
data = (data - data.min()) / (data.max() - data.min())
# 5000-element random subset of the cluster-sorted order.
rand = np.random.choice(range(len(sorting)), 5000)
rand.sort()
data = data.iloc[sorting[rand]]
for val in data.columns:
    # Value in [0, 1] -> viridis colour.
    a = [viridis(int(v * 256)) for v in data[val]]
    data[val] = a
data["clusters"] = [tuple(i) for i in colors[rand]]
#sorted clustermap of cobindings clustered with SOM
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[sorting[rand]].T), vmin=0,vmax=3,figsize=(20,15),z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with SOM")
# Bug fix: filename used str(n_clust) (the whole list, e.g. "[10, 20, ...]")
# instead of the selected cluster count str(n_clust[n]).
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_SOM_'+str(n_clust[n])+'_clustermap_cobinding.pdf')
plt.show()
#let's look at the enrichment over SOM clusters
# Build a flat group label per peak from the per-cluster index lists.
# (Loop-body indentation restored — it was lost in the export.)
groups = []
for i, val in enumerate(clusts):
    groups.extend([i] * len(val))
enr, pvals, _ = chip.enrichment(merged, groups=np.array(groups))
fig = sns.clustermap(enr.T, figsize=(12,12), vmax=4, vmin=-4, cmap='RdBu_r')
# NOTE(review): the filename says "_kmeans_" although these are SOM
# clusters — confirm the naming is intended.
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'enrichment_on_SOMcluster_metasubset.pdf')
plt.show()
# t-SNE embedding of a 30000-peak random subset of the scaled cobinding matrix.
rand = np.random.choice(merged.index,30000)
# NOTE(review): positional arguments — in sklearn's TSNE signature the second
# positional parameter is `perplexity`, so this runs with perplexity=600,
# which is unusually high; confirm it was not meant for another parameter.
red_data = TSNE(2,600,verbose=10,n_iter=1500).fit_transform(scaled_data[rand])
np.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'red_data.npy',red_data)
# Let's look at the data. the density map of the distribution of all pseudo enhancer over their tf binding profile
_, ax = plt.subplots(figsize=(10,10))
fig = sns.kdeplot(red_data[:,0], red_data[:,1], shade=True, ax=ax)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_density_TSNE.pdf')
# now let's look at it with another plotting method. we can see many many islands
plot.bigScatter(red_data,binsize=0.4,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")
## TODO: color by dependencies/pathways/expression.. after ABC model,